* Session setup: no output paging, empty data in memory, and a fresh log
* (any log left open by an earlier run is closed first; -capture- keeps
* this from failing when no log is open).
set more off
clear

capture log close
log using $logdir/get_data/post_sas/get_pre_taxsim.log, replace

*************************************
*	Preliminary quick SOI cleaning	*
*************************************
* Load the 2019 SOI extract and rename the filer TINs to prim_ssn/sec_ssn,
* the key names used for every SOI-vs-CDW merge below.
use $statadir/soi_2019, clear
rename (soi_prim_unmasked_tin soi_sec_unmasked_tin) (prim_ssn sec_ssn)

* Keep a copy of the reported secondary TIN, then blank the merge key to 0
* for soi_fil_stat == 3 returns. The copy is used in the final housekeeping
* section to put the reported value back.
generate double sec_ssn_orig = sec_ssn
quietly replace sec_ssn = 0 if soi_fil_stat == 3

// tostring prim_ssn sec_ssn sec_ssn_orig, format(%09.0f) replace force
tempfile soi
quietly save `soi', replace

*********************************
*	Also useful: a mapping		*
*	from masked to unmasked,	*
*	among the 2019 filers		*
*********************************
* Crosswalk between the *_tin and *_ssn identifiers, built from the rows of
* the CDW result file with nonfiler == 0.
use $fromsasdir/result if nonfiler == 0, clear
local idvars prim_tin sec_tin prim_ssn sec_ssn
keep `idvars'

* -force- turns any non-numeric SSN string into missing instead of aborting.
destring prim_ssn sec_ssn, force replace

tempfile mapping
quietly save `mapping'



*****************************************************************
*																*
*					Part A: Do dependents match?				*
*																*
*****************************************************************

*************************************
*									*
*		Get SOI dependent info		*
*									*
*************************************


*****************************
*	Regular dependents		*
*****************************

* One row per distinct (prim_ssn, sec_ssn, dep_ssn), built from the first
* four SOI dependent TIN slots.
use `soi', clear
foreach slot of numlist 1/4 {
	rename soi_dep`slot'_unmasked_tin dep_ssn`slot'
}
keep prim_ssn sec_ssn dep_ssn*

* Wide -> long; the slot number is not needed once the data are long.
reshape long dep_ssn, i(prim_ssn sec_ssn) j(depnum)
drop depnum

* Rows with dep_ssn == 0 are removed (presumably unused slots -- confirm).
drop if dep_ssn == 0

* Only the three key variables remain, so this removes fully duplicated rows
* and leaves the file unique on the key used by the later 1:1 merge.
duplicates drop prim_ssn sec_ssn dep_ssn, force

tempfile soi_deps
quietly save `soi_deps'



*********************
*		EITC		*
*********************

* One row per distinct (prim_ssn, sec_ssn, dep_ssn), built from the three
* SOI EITC dependent TIN slots.
use `soi', clear
foreach slot of numlist 1/3 {
	rename soi_dep`slot'_eitc_unmasked_tin dep_ssn`slot'
}
keep prim_ssn sec_ssn dep_ssn*

* Wide -> long; the slot number is not needed once the data are long.
reshape long dep_ssn, i(prim_ssn sec_ssn) j(depnum)
drop depnum

* Rows with dep_ssn == 0 are removed (presumably unused slots -- confirm).
drop if dep_ssn == 0

* Only the three key variables remain, so this removes fully duplicated rows
* and leaves the file unique on the key used by the later 1:1 merge.
duplicates drop prim_ssn sec_ssn dep_ssn, force

tempfile soi_eitc
quietly save `soi_eitc'




*************************************
*									*
*		Get CDW dependent info		*
*									*
*************************************


*****************************
*	Regular dependents		*
*****************************
* CDW dependents with dt <= 4, compared one-for-one against the SOI
* dependent list and then collapsed to one row per (prim_tin, sec_tin).
local taxyear 2019

use $fromsasdir/deps, clear
destring dt, force replace
quietly keep if dt <= 4
destring dep_ssn prim_ssn sec_ssn, force replace

// we will assume that these are no longer dependents in the tax year
quietly drop if `taxyear' - dep_yob == 24

duplicates drop prim_ssn sec_ssn dep_ssn, force
merge 1:1 prim_ssn sec_ssn dep_ssn using `soi_deps'

* A dependent is "bad" when it shows up in only one of the two sources.
generate bad_deps = _merge < 3

/*
A dependent that is in the SOI but not in the CDW (using-only above) has no
prim_tin or sec_tin, so those are filled in from the crosswalk with an
update merge.

The master-only rows here are the 2019 non-filers (who may have had
dependents in 2018). They are never using-only in the merge above, so they
already carry their own prim_tin/sec_tin.

keep(1 3 4) drops crosswalk-only rows (2) and rows whose non-missing values
disagree with the crosswalk (5).
NOTE(review): confirm that dropping the conflict rows (5) is intended.
*/
merge m:1 sec_ssn prim_ssn using `mapping', keep(1 3 4) nogen update

* -nogen- above leaves _merge untouched, so _merge still describes the
* CDW-vs-SOI dependent merge: codes 1 and 3 are dependents present in the CDW.
generate cdw_depx = inlist(_merge, 1, 3)
foreach cutoff of numlist 13 17 18 {
	generate cdw_deps`cutoff' = inlist(_merge, 1, 3) & `taxyear' - dep_yob < `cutoff'
}

* NOTE(review): (min) flags a tax unit only when every one of its dependents
* is unmatched, whereas the EITC section collapses its flag with (max).
* Confirm that the difference is intentional.
gcollapse (sum) cdw_dep* (min) bad_deps, by(prim_tin sec_tin) fast

tempfile deps
quietly save `deps'


*********************
*		EITC		*
*********************
* CDW dependents relevant for the EITC, compared against the SOI EITC
* dependent list and collapsed to one flag per (prim_tin, sec_tin).
local taxyear 2019
local age (`taxyear' - dep_yob)

use $fromsasdir/deps, clear
merge m:1 prim_tin sec_tin using $fromsasdir/result, keep(3) nogen keepusing(lag_eitc)

destring dt prim_ssn sec_ssn dep_ssn, force replace

* NOTE(review): missing values compare as larger than any number, so a
* missing dt gives eic_dep == 1, a missing lag_eitc satisfies lag_eitc > 0,
* and a missing student counts as true -- confirm this is acceptable.
generate eic_dep = dt > 4
generate apparent_eic_dep = dt <= 4 & (`age' < 18 | (`age' < 24 & student))

* NOTE(review): eic is not created in this block. Unless the deps file has a
* variable with exactly that name, Stata's variable abbreviation resolves it
* to eic_dep -- confirm which variable is meant (lag_eitc == 0, perhaps?).
quietly keep if (lag_eitc > 0 & eic_dep) | (eic == 0 & apparent_eic_dep)
duplicates drop prim_ssn sec_ssn dep_ssn, force

keep prim_ssn sec_ssn dep_ssn prim_tin sec_tin
merge 1:1 prim_ssn sec_ssn dep_ssn using `soi_eitc'

* Problem rows: an EITC dependent on the SOI return with no CDW counterpart.
generate problem = _merge == 2

/*
A dependent that is in the SOI but not in the CDW (using-only above) has no
prim_tin or sec_tin, so those are filled in from the crosswalk with an
update merge.

The master-only rows here are the 2019 non-filers (who may have had
dependents in 2018). They are never using-only in the merge above, so they
already carry their own prim_tin/sec_tin.
*/
merge m:1 sec_ssn prim_ssn using `mapping', keep(1 3 4) nogen update

* A tax unit is flagged when at least one of its rows is a problem row.
gcollapse (max) bad_eitc_deps = problem, by(prim_tin sec_tin) fast

tempfile eitc_problem
quietly save `eitc_problem'










*********************************************************
*														*
*				Part B. Merge everything else			*
*														*
*********************************************************



*********************
*	Clean the CDW	*
*********************
* Load the CDW result file and remember every variable that is not an
* identifier or a non-filer field; these are prefixed with cdw_ further down.
use $fromsasdir/result, clear
ds prim_ssn sec_ssn prim_tin sec_tin nonfiler yob_nonfiler, not

* Copy the returned list as a macro rather than evaluating it with "=":
* a macro copy is not subject to string-expression evaluation and yields an
* empty list (not an expression result) when -ds- returns nothing.
local vlist `r(varlist)'
di "`vlist'"

* -force- turns any non-numeric SSN string into missing instead of aborting.
destring prim_ssn sec_ssn, replace force

*************************
*	Merge dependents	*
*************************
* Attach the per-tax-unit dependent counts and flags. Units without a row in
* the dependent files are kept (master-only); dependent-file rows without a
* unit are not.
* NOTE(review): only bad_eitc_deps is zero-filled here; bad_deps and the
* cdw_dep* counts stay missing for units absent from the dependent file.
merge 1:1 prim_tin sec_tin using `deps', keep(master match) nogen
merge 1:1 prim_tin sec_tin using `eitc_problem', keep(master match) nogen

quietly replace bad_eitc_deps = 0 if missing(bad_eitc_deps)


*************************************************
*	Merge the SOI data (by prim_ssn, sec_ssn)	*
*************************************************
* Bring in the SOI variables by unmasked TIN.
* This will not be a successful merge for the non-filers!
merge m:1 prim_ssn sec_ssn using `soi', nogen

// Should not be any using-only rows: everyone in SOI should be in CDW.
// There are some master-only rows: this is the nonfiler sample. The merge is
// m:1 rather than 1:1 -- presumably because non-filer rows are not unique on
// (prim_ssn, sec_ssn).
quietly replace soi_wgt = 1000 if nonfiler == 1


*************************************
*	Compute # of SOI dependents		*
*************************************
* Count the populated SOI dependent TIN slots (1 through 10).
* Stata treats missing as larger than any number, so "x > 0" alone is true
* for missing x. Rows without an SOI match still have missing soi_* values at
* this point (the zero-fill happens later), so the explicit !missing() check
* keeps them at 0 instead of counting all 10 slots.
gen soi_num_deps = 0
forval i = 1/10 {
	qui replace soi_num_deps = soi_num_deps + 1 if soi_dep`i'_unmasked_tin > 0 & !missing(soi_dep`i'_unmasked_tin)
}





*****************************
*		Rename stuff		*
*****************************
* Prefix every CDW variable collected in vlist with cdw_ and zero-fill its
* missing values. -capture- lets the replace fail silently where assigning 0
* is not possible (e.g. string variables).
foreach cdwvar of local vlist {
	rename `cdwvar' cdw_`cdwvar'
	capture replace cdw_`cdwvar' = 0 if missing(cdw_`cdwvar')
}

* Same zero-fill for every soi_* variable.
foreach soivar of varlist soi_* {
	capture replace `soivar' = 0 if missing(`soivar')
}


*************************
*	Final housekeeping	*
*************************
* Non-filers take their year of birth from the non-filer field.
qui replace soi_prim_yob = yob_nonfiler if nonfiler == 1
drop yob_nonfiler

// undo the change we made to the secondary TINs for MFS filers
// NOTE(review): rows without an SOI match have a missing sec_ssn_orig, so
// their sec_ssn becomes missing here as well -- confirm this is intended.
replace sec_ssn = sec_ssn_orig

* Drop the backup copy, not sec_ssn itself: the rename below must act on the
* restored variable and must not depend on variable-name abbreviation.
drop sec_ssn_orig

rename sec_ssn sec_unmasked_tin
rename prim_ssn prim_unmasked_tin

*************************
*	CDW vs. SOI match	*
*************************

* Indicators for filing status 1 or 4 in each source.
* NOTE(review): TEMP_cdw14 is created but never used below -- confirm
* whether it was meant to enter the match rule.
gen byte TEMP_soi14 = inlist(soi_fil_stat, 1, 4)
gen byte TEMP_cdw14 = inlist(cdw_filing_status, 1, 4)

* soi_cdw_match is 1 for:
*   - filers (nonfiler == 0) with cdw_tu_match == 1, or with
*     cdw_nonfiler_x == 1 and SOI filing status 1 or 4;
*   - all non-filers (nonfiler == 1).
gen byte soi_cdw_match = 0
replace soi_cdw_match = 1 if nonfiler == 0 & (cdw_tu_match == 1 | (cdw_nonfiler_x == 1 & TEMP_soi14))
replace soi_cdw_match = 1 if nonfiler == 1
drop TEMP*

* Shrink storage types and write the final analysis file.
compress
quietly save $statadir/pre_taxsim_data, replace






